{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create entry points to spark\n",
    "try:\n",
    "    sc.stop()\n",
    "except:\n",
    "    pass\n",
    "from pyspark import SparkContext, SparkConf\n",
    "from pyspark.sql import SparkSession\n",
    "sc=SparkContext()\n",
    "spark = SparkSession(sparkContext=sc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `pyspark.sql.functions` functions\n",
    "\n",
    "`pyspark.sql.functions` is collection of built-in functions for **creating column expressions**. These functions largely increase methods that we can use to manipulate DataFrame and DataFrame columns.\n",
    "\n",
    "There are many sql functions from the `pyspark.sql.functions` module. Here I only choose a few to show how these functions extend the ability to create column expressions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import functions as F"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `abs()`: create column expression that returns absolute values of a column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+\n",
      "|  x|\n",
      "+---+\n",
      "|  1|\n",
      "| -1|\n",
      "| -2|\n",
      "+---+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql import Row\n",
    "df = sc.parallelize([Row(x=1), Row(x=-1), Row(x=-2)]).toDF()\n",
    "df.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Column<b'abs(x)'>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_abs = F.abs(df.x)\n",
    "x_abs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+------+\n",
      "|  x|abs(x)|\n",
      "+---+------+\n",
      "|  1|     1|\n",
      "| -1|     1|\n",
      "| -2|     2|\n",
      "+---+------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select(df.x, x_abs).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `concat()`: create column expression that concatenates multiple column values into one "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-------+\n",
      "|     a|      b|\n",
      "+------+-------+\n",
      "| apple|   tree|\n",
      "|orange|flowers|\n",
      "+------+-------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df = sc.parallelize([Row(a='apple', b='tree'), Row(a='orange', b='flowers')]).toDF()\n",
    "df.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Column<b'concat(a, b)'>"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ab_concat = F.concat(df.a, df.b)\n",
    "ab_concat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-------+-------------+\n",
      "|     a|      b| concat(a, b)|\n",
      "+------+-------+-------------+\n",
      "| apple|   tree|    appletree|\n",
      "|orange|flowers|orangeflowers|\n",
      "+------+-------+-------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select(df.a, df.b, ab_concat).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `corr()`: create column expression that returns pearson correlation coefficient between two columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n",
      "|              _c0| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|\n",
      "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n",
      "|        Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|\n",
      "|    Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|\n",
      "|       Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|\n",
      "|   Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|\n",
      "|Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|\n",
      "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "mtcars = spark.read.csv('../../data/mtcars.csv', inferSchema=True, header=True)\n",
    "mtcars.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Column<b'corr(drat, wt)'>"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drat_wt_corr = F.corr(mtcars.drat, mtcars.wt)\n",
    "drat_wt_corr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------------+\n",
      "|     corr(drat, wt)|\n",
      "+-------------------+\n",
      "|-0.7124406466973717|\n",
      "+-------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "mtcars.select(drat_wt_corr).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## `array()`: create column expression that merge multiple column values into an array\n",
    "\n",
    "This function can be used to build **feature column** in machine learning models."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Column<b'mpg'>,\n",
       " Column<b'cyl'>,\n",
       " Column<b'disp'>,\n",
       " Column<b'hp'>,\n",
       " Column<b'drat'>,\n",
       " Column<b'wt'>,\n",
       " Column<b'qsec'>,\n",
       " Column<b'vs'>,\n",
       " Column<b'am'>,\n",
       " Column<b'gear'>,\n",
       " Column<b'carb'>]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cols = [eval('mtcars.' + col) for col in mtcars.columns[1:]]\n",
    "cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Column<b'array(mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb)'>"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cols_array = F.array(cols)\n",
    "cols_array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------------------------------------------------------------+\n",
      "|array(mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb)    |\n",
      "+-----------------------------------------------------------------+\n",
      "|[21.0, 6.0, 160.0, 110.0, 3.9, 2.62, 16.46, 0.0, 1.0, 4.0, 4.0]  |\n",
      "|[21.0, 6.0, 160.0, 110.0, 3.9, 2.875, 17.02, 0.0, 1.0, 4.0, 4.0] |\n",
      "|[22.8, 4.0, 108.0, 93.0, 3.85, 2.32, 18.61, 1.0, 1.0, 4.0, 1.0]  |\n",
      "|[21.4, 6.0, 258.0, 110.0, 3.08, 3.215, 19.44, 1.0, 0.0, 3.0, 1.0]|\n",
      "|[18.7, 8.0, 360.0, 175.0, 3.15, 3.44, 17.02, 0.0, 0.0, 3.0, 2.0] |\n",
      "|[18.1, 6.0, 225.0, 105.0, 2.76, 3.46, 20.22, 1.0, 0.0, 3.0, 1.0] |\n",
      "|[14.3, 8.0, 360.0, 245.0, 3.21, 3.57, 15.84, 0.0, 0.0, 3.0, 4.0] |\n",
      "|[24.4, 4.0, 146.7, 62.0, 3.69, 3.19, 20.0, 1.0, 0.0, 4.0, 2.0]   |\n",
      "|[22.8, 4.0, 140.8, 95.0, 3.92, 3.15, 22.9, 1.0, 0.0, 4.0, 2.0]   |\n",
      "|[19.2, 6.0, 167.6, 123.0, 3.92, 3.44, 18.3, 1.0, 0.0, 4.0, 4.0]  |\n",
      "|[17.8, 6.0, 167.6, 123.0, 3.92, 3.44, 18.9, 1.0, 0.0, 4.0, 4.0]  |\n",
      "|[16.4, 8.0, 275.8, 180.0, 3.07, 4.07, 17.4, 0.0, 0.0, 3.0, 3.0]  |\n",
      "|[17.3, 8.0, 275.8, 180.0, 3.07, 3.73, 17.6, 0.0, 0.0, 3.0, 3.0]  |\n",
      "|[15.2, 8.0, 275.8, 180.0, 3.07, 3.78, 18.0, 0.0, 0.0, 3.0, 3.0]  |\n",
      "|[10.4, 8.0, 472.0, 205.0, 2.93, 5.25, 17.98, 0.0, 0.0, 3.0, 4.0] |\n",
      "|[10.4, 8.0, 460.0, 215.0, 3.0, 5.424, 17.82, 0.0, 0.0, 3.0, 4.0] |\n",
      "|[14.7, 8.0, 440.0, 230.0, 3.23, 5.345, 17.42, 0.0, 0.0, 3.0, 4.0]|\n",
      "|[32.4, 4.0, 78.7, 66.0, 4.08, 2.2, 19.47, 1.0, 1.0, 4.0, 1.0]    |\n",
      "|[30.4, 4.0, 75.7, 52.0, 4.93, 1.615, 18.52, 1.0, 1.0, 4.0, 2.0]  |\n",
      "|[33.9, 4.0, 71.1, 65.0, 4.22, 1.835, 19.9, 1.0, 1.0, 4.0, 1.0]   |\n",
      "+-----------------------------------------------------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "mtcars.select(cols_array).show(truncate=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
